library(dplyr)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
# Preview the column names, types and leading values of the review data.
movies_dataset %>% glimpse()
Observations: 2,000
Variables: 2
$ class <chr> "Pos", "Pos", "Pos", "Pos", "Pos", "Pos", "Pos", "Pos", "Pos", …
$ text  <chr> "films adapted from comic books have had plenty of success   wh…

Bag of Words Tokenisation

In this approach, we represent each word in a document as a token (or feature) and each document as a vector of features. In addition, for simplicity, we disregard word order and focus only on the number of occurrences of each word, i.e., we represent each document as a multiset (‘bag’) of words.


# Build a sparse document-term matrix: one matrix row per review (identified
# by its row number in movies_dataset), one column per distinct token, and
# each cell holding the number of times that token occurs in that review.
dtm <- movies_dataset %>%
  select(-class) %>%
  mutate(row = row_number())

# The (word, row) grouping order is kept as-is in case cast_sparse() derives
# the row/column order of the matrix from the order of the input rows.
dtm <- dtm %>%
  unnest_tokens(word, text) %>%
  group_by(word, row) %>%
  summarise(total = n()) %>%
  cast_sparse(row, word, total)

dtm

# Inspect the sparse object directly: converting the full matrix to dense
# form just to look at its structure allocates documents x vocabulary doubles.
str(dtm)

# Subset while still sparse, then densify only the 2 x 31 slice for display.
as.matrix(dtm[1:2, 2000:2030])

Remove Stop Words

# NOTE(review): `i` is not defined anywhere in the visible source; this chunk
# looks like a leftover placeholder -- confirm whether it should be removed.
i

Term frequency

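The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites. It multiplies a word's term frequency (tf) within a document by its inverse document frequency (idf), which down-weights words that appear in many documents of the collection.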


# Per-document word counts for a random sample of 100 reviews, with stop
# words removed. `total` is the number of times `word` occurs in review `row`.
# NOTE: sample_n() draws a different sample on every run; call set.seed()
# beforehand if reproducible output is needed.
dtm <- movies_dataset %>%
  select(-class) %>%
  sample_n(100) %>%
  mutate(row = row_number()) %>%
  unnest_tokens(word, text) %>%
  group_by(word, row) %>%
  summarise(total = n()) %>%
  # summarise() only peels off the last grouping variable; drop the rest so
  # the leftover `word` grouping does not leak into the steps below.
  ungroup() %>%
  anti_join(stop_words, by = "word")

# n = occurrences of `word` in review `row`. dtm already holds one row per
# (word, row) pair, so an unweighted count() would yield n = 1 everywhere;
# weighting by `total` restores the real term counts.
row_words <- dtm %>%
  count(row, word, wt = total, sort = TRUE)

# Total number of (non-stop-word) tokens in each review.
total_words <- dtm %>%
  group_by(row) %>%
  summarise(total = sum(total))

# Attach the per-review totals and add the tf, idf and tf_idf columns.
inner_join(row_words, total_words, by = "row") %>%
  bind_tf_idf(word, row, n)
LS0tCnRpdGxlOiAiVGlkeVRleHQgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KYGBge3J9CmxpYnJhcnkocmVhZHIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGlkeXRleHQpCmBgYApgYGB7cn0KbW92aWVzX2RhdGFzZXQ8LXJlYWRfY3N2KCJkYXRhL21vdmllLXBhbmcwMi5jc3YuZ3oiKQpnbGltcHNlKG1vdmllc19kYXRhc2V0KQpoZWFkKG1vdmllc19kYXRhc2V0KQoKYGBgCiMgQmFnIG9mIFdvcmRzIFRva2VuaXNhdGlvbgoKSW4gdGhpcyBhcHByb2FjaCwgd2UgcmVwcmVzZW50IGVhY2ggd29yZCBpbiBhIGRvY3VtZW50IGFzIGEgdG9rZW4gKG9yIGZlYXR1cmUpIGFuZCBlYWNoIGRvY3VtZW50IGFzIGEgdmVjdG9yIG9mIGZlYXR1cmVzLiBJbiBhZGRpdGlvbiwgZm9yIHNpbXBsaWNpdHksIHdlIGRpc3JlZ2FyZCB3b3JkIG9yZGVyIGFuZCBmb2N1cyBvbmx5IG9uIHRoZSBudW1iZXIgb2Ygb2NjdXJyZW5jZXMgb2YgZWFjaCB3b3JkIGkuZS4sIHdlIHJlcHJlc2VudCBlYWNoIGRvY3VtZW50IGFzIGEgbXVsdGktc2V0IOKAmGJhZ+KAmSBvZiB3b3Jkcy4KCgpgYGB7cn0KCmR0bTwtbW92aWVzX2RhdGFzZXQgJT4lIHNlbGVjdCgtY2xhc3MpICAgJT4lIAogIG11dGF0ZShyb3c9cm93X251bWJlcigpKSAgCgpkdG0gPC0gZHRtICU+JSB1bm5lc3RfdG9rZW5zKHdvcmQsdGV4dCkgJT4lIGdyb3VwX2J5KHdvcmQscm93KSAlPiUgc3VtbWFyaXNlKHRvdGFsPW4oKSkgJT4lIGNhc3Rfc3BhcnNlKHJvdyx3b3JkLHRvdGFsKQoKZHRtCnN0cihhcy5tYXRyaXgoZHRtKSkKYXMubWF0cml4KGR0bSlbMToyLDIwMDA6MjAzMF0gCgpgYGAKCiMgUmVtb3ZlIFN0b3AgV29yZHMKCmBgYHtyfQpkYXRhKHN0b3Bfd29yZHMpCmhlYWQoc3RvcF93b3JkcykKbW92aWVzX2RhdGFzZXQ8LXJlYWRfY3N2KCJkYXRhL21vdmllLXBhbmcwMi5jc3YuZ3oiKQoKbW92aWVzX2RhdGFzZXQ8LW1vdmllc19kYXRhc2V0ICAlPiUgIG11dGF0ZShyb3dudW1iZXI9cm93X251bWJlcigpKSAKCmR0bTwtbW92aWVzX2RhdGFzZXQgJT4lIHNlbGVjdCgtY2xhc3MpICU+JSB1bm5lc3RfdG9rZW5zKHdvcmQsdGV4dCkgJT4lIGdyb3VwX2J5KHdvcmQscm93bnVtYmVyKSAlPiUgCiAgc3VtbWFyaXNlKHRvdGFsPW4oKSkgJT4lCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpCgpkdG08LSBkdG0gJT4lIGNhc3Rfc3BhcnNlKHJvd251bWJlcix3b3JkLHRvdGFsKQpkdG08LWFzLm1hdHJpeChkdG0pICU+JSBhcy5kYXRhLmZyYW1lKCkKCgoKCmR0bTwtaW5uZXJfam9pbihtb3ZpZXNfZGF0YXNldCxkdG0sYnk9Yygncm93JykpICU+JSBzZWxlY3QoLXRleHQueCkKZHRtICU+JSBzZWxlY3Qocm93bnVtYmVyKQpgYGAKYGBge3J9CmkKYGBgCgojIFRlcm0gZnJlcXVlbmN5ClRoZSBzdGF0aXN0aWMgdGYtaWRmIGlzIGludGVuZGVkIHRvIG1lYXN1cmUgaG93IGltcG9ydGFudCBhIHdvcmQgaXMgdG8gYSBkb2N1bWVudCBpbiBhIGNvbGxlY3Rpb24gKG9yIGNvcnB1cykgb2YgZG9j
dW1lbnRzLCBmb3IgZXhhbXBsZSwgdG8gb25lIG5vdmVsIGluIGEgY29sbGVjdGlvbiBvZiBub3ZlbHMgb3IgdG8gb25lIHdlYnNpdGUgaW4gYSBjb2xsZWN0aW9uIG9mIHdlYnNpdGVzLiAKCgpgYGB7cn0KCmR0bTwtbW92aWVzX2RhdGFzZXQgJT4lIHNlbGVjdCgtY2xhc3MpICAlPiUgc2FtcGxlX24oMTAwKSAlPiUgCiAgbXV0YXRlKHJvdz1yb3dfbnVtYmVyKCkpICU+JSB1bm5lc3RfdG9rZW5zKHdvcmQsdGV4dCkgJT4lIGdyb3VwX2J5KHdvcmQscm93KSAlPiUgCiAgc3VtbWFyaXNlKHRvdGFsPW4oKSkgJT4lCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpCnJvd193b3JkcyA8LSBkdG0lPiUgY291bnQocm93LHdvcmQsIHNvcnQ9VFJVRSkKdG90YWxfd29yZHMgPC0gZHRtICU+JSBncm91cF9ieShyb3cpICU+JSBzdW1tYXJpc2UodG90YWw9bigpKQoKaW5uZXJfam9pbihyb3dfd29yZHMsdG90YWxfd29yZHMpICU+JSAgIGJpbmRfdGZfaWRmKHdvcmQsIHJvdywgbikKCgpgYGAKCgo=